## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 529929 28.4 1183891 63.3 660382 35.3
## Vcells 970766 7.5 8388608 64.0 1770494 13.6
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 530163 28.4 1183891 63.3 660382 35.3
## Vcells 971249 7.5 8388608 64.0 1770494 13.6
# BiocManager::install("mixOmicsTeam/mixOmics@devel")
library(mixOmics)
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("BiocParallel")
library(BiocParallel)fp = file.path('..', 'input')
fn = 'Enzymomics_only_24_shortnames.txt'
E = read.delim(file = file.path(fp, fn),
header = TRUE,
sep = "\t",
quote = NULL,
dec = ".",
fill = TRUE,
comment.char = '@')
fn = 'Metabolomics_only_24_shortnames.txt'
M = read.delim(file = file.path(fp, fn),
header = TRUE,
sep = "\t",
quote = NULL,
dec = ".",
fill = TRUE,
comment.char = '@')
# fn = 'phosphoProteomics_filtered_sig_24_shortnames.txt'
# fn = 'phosphoProteomics_filtered_sig_24_shortnames_diablo.csv'
fn = 'phosphoProteomics_filtered_sig_24_shortnames.csv'
P = read.delim(file = file.path(fp, fn),
header = TRUE,
sep = ',', # "\t", # ','
quote = NULL,
dec = ".",
fill = TRUE,
comment.char = '@')
all(P$treatment == E$treatment)## [1] TRUE
## [1] TRUE
## [1] 7.57635 23.40680
## [1] 8.251654e+02 1.365161e+07
## [1] 16.63 67335.00
# range(log(M[, -1], 10), na.rm = TRUE)
# range(log(E[, -1], 10), na.rm = TRUE)
range(log(M[, -1], 2), na.rm = TRUE)## [1] 9.68854 23.70257
## [1] 4.055716 16.039069
# PBSmodelling::plotDens(M[, -1])
# PBSmodelling::plotDens(log(M[, -1], 2))
tmp = tidyr::gather(M, var, measurement, colnames(M)[2]:colnames(M)[ncol(M)],
factor_key = FALSE)
range(tmp$measurement, na.rm = TRUE)## [1] 8.251654e+02 1.365161e+07
ggplot(tmp, aes(x=measurement, fill=var)) +
geom_density(alpha = 0.1) +
theme(legend.position="none")## [1] 9.68854 23.70257
ggplot(tmp, aes(x=measurement, fill=var)) +
geom_density(alpha = 0.1) +
theme(legend.position="none")tmp = tidyr::gather(E, var, measurement, colnames(E)[2]:colnames(E)[ncol(E)],
factor_key = FALSE)
range(tmp$measurement, na.rm = TRUE)## [1] 16.63 67335.00
ggplot(tmp, aes(x=measurement, fill=var)) +
geom_density(alpha = 0.1) +
theme(legend.position="none")## [1] 4.055716 16.039069
ggplot(tmp, aes(x=measurement, fill=var)) +
geom_density(alpha = 0.1) +
theme(legend.position="none")numeric table
https://mixomics.org/mixdiablo/diablo-tcga-case-study/
## $Enz
## [1] 15 24
##
## $Met
## [1] 15 52
##
## $Pph
## [1] 15 39
## NM_24 NM_inf_24 AM_24 AM_inf_24
## 4 4 3 4
Circle Correlation Plots for pairwise PLS models
Only displays the top 24 features for each dimension, subsetting by those with a correlation above 0.5
# 24 is the smalles data set here
no = min(unlist(lapply(data, ncol)))
list.keepX = c(no, no) # select arbitrary values of features to keep
list.keepY = c(no, no)
# generate three pairwise PLS models
pls1 <- spls(data[["Enz"]], data[["Met"]],
keepX = list.keepX, keepY = list.keepY)
pls2 <- spls(data[["Enz"]], data[["Pph"]],
keepX = list.keepX, keepY = list.keepY)
pls3 <- spls(data[["Met"]], data[["Pph"]],
keepX = list.keepX, keepY = list.keepY)
# plot features of first PLS
plotVar(pls1, cutoff = 0.5, title = "(a) enzymomics vs metabolomics",
legend = c("enzymomics", "metabolomics"),
var.names = TRUE, style = 'graphics',
pch = c(16, 17), cex = c(0.5,0.5),
col = c('darkorchid', 'lightgreen'))plotVar(pls1, cutoff = 0.5, title = "(a) enzymomics vs metabolomics",
legend = c("enzymomics", "metabolomics"),
var.names = FALSE, style = 'graphics',
pch = c(16, 17), cex = c(2,2),
col = c('darkorchid', 'lightgreen'))# plot features of second PLS
plotVar(pls2, cutoff = 0.5, title = "(b) enzymomics vs phosphoproteomics",
legend = c("enzymomics", "phosphoproteomics"),
var.names = TRUE, style = 'graphics',
pch = c(16, 17), cex = c(0.5,0.5),
col = c('darkorchid', 'blue'))plotVar(pls2, cutoff = 0.5, title = "(b) enzymomics vs phosphoproteomics",
legend = c("enzymomics", "phosphoproteomics"),
var.names = FALSE, style = 'graphics',
pch = c(16, 17), cex = c(2,2),
col = c('darkorchid', 'blue'))# plot features of third PLS
plotVar(pls3, cutoff = 0.5, title = "(c) metabolomics vs phosphoproteomics",
legend = c("metabolomics", "phosphoproteomics"),
var.names = TRUE, style = 'graphics',
pch = c(16, 17), cex = c(0.5,0.5),
col = c('lightgreen', 'blue'))plotVar(pls3, cutoff = 0.5, title = "(c) metabolomics vs phosphoproteomics",
legend = c("metabolomics", "phosphoproteomics"),
var.names = FALSE, style = 'graphics',
pch = c(16, 17), cex = c(2,2),
col = c('lightgreen', 'blue'))## comp1 comp2
## comp1 0.6910261 1.595938e-05
## comp2 0.5273285 8.555415e-01
## comp1 comp2
## comp1 0.8760157 0.008333107
## comp2 -0.2952379 0.850452671
## comp1 comp2
## comp1 0.8036788 0.003380316
## comp2 0.4032532 0.883495643
design = matrix(0.5, ncol = length(data), nrow = length(data), # for square matrix filled with 0.1s
dimnames = list(names(data), names(data)))
diag(design) = 0 # set diagonal to 0s
design## Enz Met Pph
## Enz 0.0 0.5 0.5
## Met 0.5 0.0 0.5
## Pph 0.5 0.5 0.0
Choosing the number of components in block.plsda using perf() with 10 × 10-fold CV function
10-fold Cross-validation & leave-one-out cross-validation: https://stats.stackexchange.com/questions/154830/10-fold-cross-validation-vs-leave-one-out-cross-validation
## ---- fig.cap = "FIGURE 2: Choosing the number of components in `block.plsda` using `perf()` with 10 × 10-fold CV function in the `breast.TCGA` study. Classification error rates (overall and balanced, see Section 7.3) are represented on the y-axis with respect to the number of components on the x-axis for each prediction distance presented in PLS-DA"----
perf.diablo = perf(basic.diablo.model,
dist = 'all',
cpus = 1,
validation = 'loo',
progressBar = TRUE,
folds = 10,
nrepeat = 10) # run component number tuning with repeated CV##
## Performing repeated cross-validation with nrepeat = 10...
##
|
| | 0%
|
|======= | 10%
|
|============== | 20%
|
|===================== | 30%
|
|============================ | 40%
|
|=================================== | 50%
|
|========================================== | 60%
|
|================================================= | 70%
|
|======================================================== | 80%
|
|=============================================================== | 90%
|
|======================================================================| 100%
ncomp = perf.diablo$choice.ncomp$WeightedVote["Overall.BER", "centroids.dist"] # set the optimal ncomp value
perf.diablo$choice.ncomp$WeightedVote # show the optimal choice for ncomp for each dist metric## max.dist centroids.dist mahalanobis.dist
## Overall.ER 5 3 1
## Overall.BER 5 3 2
We choose the optimal number of variables to select in each data set using the tune.block.splsda() function, for a grid of keepX values for each type of omics. Note that the function has been set to favour a relatively small signature while allowing us to obtain a sufficient number of variables for downstream validation and/or interpretation.
https://rdrr.io/bioc/BiocParallel/man/BiocParallelParam-class.html
bplapply erro solution https://support.bioconductor.org/p/133353/#133356
options(SnowParam=SnowParam(workers=1))
BPPARAM = bpparam()
BPPARAM <- BiocParallel::SnowParam(workers = parallel::detectCores()-1)Crate tune.MIR with tune.block.splsda
and save RData (beacuse it takes cca 1h)
then comment these lines
and just load RData
10-fold Cross-validation & leave-one-out cross-validation: https://stats.stackexchange.com/questions/154830/10-fold-cross-validation-vs-leave-one-out-cross-validation
# set grid of values for each component to test
# test.keepX = list (Met = c(5:9, seq(10, 15, 2), seq(20,24,4)),
# Enz = c(5:9, seq(10, 15, 2), seq(20,24,4)),
# Pph = c(5:9, seq(10, 15, 2), seq(20,24,4)))
x <- list()
for (i in 1:length(data)){
x[[i]] <- c( seq(5,min(24, ncol(data[[i]])) ,2))
}
names(x) <- names(data)
test.keepX <- x
test.keepX## $Enz
## [1] 5 7 9 11 13 15 17 19 21 23
##
## $Met
## [1] 5 7 9 11 13 15 17 19 21 23
##
## $Pph
## [1] 5 7 9 11 13 15 17 19 21 23
# You have provided a sequence of keepX of length: 10 for block Met and 10 for block Enz and 10 for block Pph.
# This results in 1000 models being fitted for each component and each nrepeat, this may take some time to run, be patient!
# Because of a too high number of 'folds' required, 2 folds were randomly assigned no data: the number of 'folds' is reduced to 8
t1 = proc.time()
# run the feature selection tuning
library(stats)
tune.MIR = tune.block.splsda(X = data, Y = Y,
ncomp = ncomp,
test.keepX = test.keepX,
design = design,
validation = 'loo',
folds = 10, nrepeat = 10,
BPPARAM = BPPARAM ,
dist = "centroids.dist",
max.iter = 200)
t2 = proc.time()
running_time = t2 - t1; running_time## user system elapsed
## 1.91 0.44 1159.11
fp = file.path('..', 'output')
fn = '01_DIABLO-Enz-Met-Pph_tune.MIR.RData'
# save(tune.MIR, file = file.path(fp, fn))
load(file.path(fp, fn))
list.keepX = tune.MIR$choice.keepX
list.keepX## $Enz
## [1] 5 5 5
##
## $Met
## [1] 19 15 21
##
## $Pph
## [1] 13 9 5
The number of features to select on each component is returned in
## $Enz
## [1] 5 5 5
##
## $Met
## [1] 19 15 21
##
## $Pph
## [1] 13 9 5
# set the optimised DIABLO model
final.diablo.model = block.splsda(X = data, Y = Y, ncomp = ncomp
, keepX = list.keepX
, design = design)
## Design matrix has changed to include Y; each block will be
## linked to Y.# the features selected from components
for (comp in 1:ncomp){
cat("\nComponent ", comp,":\n")
for(i in 1:length(data)){
cat(names(data)[i],"\n")
print(selectVar(final.diablo.model, comp = comp)[[i]]$name)
}
}##
## Component 1 :
## Enz
## [1] "MDHt.NADP." "GDH.NAD." "max_Rubisco" "PK" "Ppi"
## Met
## [1] "X1.Methylnicotinamide" "X6.Hydroxypurine"
## [3] "X2.Phenylacetamide" "Homoplantaginin"
## [5] "Spinosin" "Plantagoside"
## [7] "Flavone" "Riboflavin"
## [9] "DHDMIF.o.glu" "X6.Benzylaminopurine"
## [11] "Acanthoside.B" "Thr"
## [13] "Arginine" "X2..Deoxyguanosine.5..monophosphate"
## [15] "Luteolin" "Tetrahydropiperine"
## [17] "Eriocitrin" "Phe"
## [19] "Bavachin"
## Pph
## [1] "TOM1" "LAG1_LP" "zf.B_box" "URK" "GTs"
## [6] "zf_A20.1" "RING_typeE3s" "AURPU" "LHT8" "LOX"
## [11] "DUF3511" "NP24pp" "AOS"
##
## Component 2 :
## Enz
## [1] "GK" "KHK" "MDHi..NADP." "FBPase.cyt." "AGPase"
## Met
## [1] "Solasodine" "Hydrocortisone"
## [3] "Eriocitrin" "Isorhamnetin.3.O.neohesperidin"
## [5] "Coumarine" "Corticosterone"
## [7] "Nobiletin" "Neohesperidin"
## [9] "X5.methoxyflavone" "X3.Dehidroshikimate"
## [11] "Liquiritigenin" "Thr"
## [13] "Rutin" "X4.Coumaryl.alcohol"
## [15] "Apigenin.7.glucoside"
## Pph
## [1] "PR1" "X5." "GGH" "GR_RBP3" "SQLE" "Chitinase"
## [7] "DCL3" "PK" "VPS33A"
##
## Component 3 :
## Enz
## [1] "NAD_GAPDH" "DH" "max_Rubisco" "PGIt" "NT"
## Met
## [1] "Quercitrin" "Nobiletin"
## [3] "Isorhamnetin.3.O.neohesperidin" "X6.Benzylaminopurine"
## [5] "X2..Deoxyguanosine.5..monophosphate" "Guaijaverin"
## [7] "Tetrahydropiperine" "Narirutin"
## [9] "Neohesperidin" "Trigonelline"
## [11] "Vitexicarpin" "Tomatine"
## [13] "Liquiritigenin" "X5.Demethylnobiletin"
## [15] "Plantagoside" "Hydrocortisone"
## [17] "Bavachin" "DHDMIF.o.glu"
## [19] "Thr" "Flavone"
## [21] "X5.methoxyflavone"
## Pph
## [1] "PKMTs" "VPS33A" "CHI17" "GR_RBP3" "DSlP"
for(comp in 1:ncomp){
plotDiablo(final.diablo.model, ncomp = comp)
title(paste("Component",comp), adj=0.1, line=-1, outer=TRUE)
}plind <- plotIndiv(final.diablo.model, ind.names = FALSE, legend = TRUE,
title = 'DIABLO Sample Plots'
, ellipse = TRUE,
comp = c(1,3)
)plind <- plotIndiv(final.diablo.model, ind.names = FALSE, legend = TRUE,
title = 'DIABLO Sample Plots'
, ellipse = TRUE,
comp = c(1,2)
)plind <- plotIndiv(final.diablo.model, ind.names = FALSE, legend = TRUE,
title = 'DIABLO Sample Plots'
, ellipse = TRUE,
comp = c(2,3)
)plotArrow(final.diablo.model, ind.names = FALSE, legend = TRUE,
title = paste(groups,collapse=", ")
)if(length(data)==3) pick <- 1:3 else pick <- c(4,1:3)
cols <- c('orange1', 'brown1', 'lightgreen',"lightblue")[pick]
pchs <- c(16, 17, 15, 18)[pick]
plotVar(final.diablo.model, var.names = FALSE,
style = 'graphics', legend = TRUE
, pch = pchs, cex = rep(2,length(data))
, col = cols
)plotVar(final.diablo.model, var.names = TRUE,
style = 'graphics', legend = TRUE
, pch = pchs, cex = rep(0.5,length(data))
, col = cols
)cutoff = 0.50
circosPlot(final.diablo.model, cutoff = cutoff, line = TRUE,
color.blocks= cols,
color.cor = c(3,2), size.labels = 1
, xpd=TRUE)##
## adding block name to feature names in the output similarity matrix as there are similar feature names across blocks.
cutoff = 0.90
circosPlot(final.diablo.model, cutoff = cutoff, line = TRUE,
color.blocks= cols,
color.cor = c(3,2), size.labels = 1
, xpd=TRUE)##
## adding block name to feature names in the output similarity matrix as there are similar feature names across blocks.
cutoff = 0.75
circosPlot(final.diablo.model, cutoff = cutoff, line = TRUE,
color.blocks= cols,
color.cor = c(3,2), size.labels = 1
, xpd=TRUE)##
## adding block name to feature names in the output similarity matrix as there are similar feature names across blocks.
issue https://github.com/mixOmicsTeam/mixOmics/issues/45
# plotLoadings encountered margin errors. Ensure feature names are not too long (see 'name.var' argument) and the 'Plots' pane is cleared and enlargened.
traceback()## No traceback available
cimfn <- "cim.png"
png(cimfn, res = 600, width = 8000, height = 4000)
cimDiablo(final.diablo.model, size.legend=0.7)##
## trimming values to [-3, 3] range for cim visualisation. See 'trim' arg in ?cimDiablo
## pdf
## 2
##
## trimming values to [-3, 3] range for cim visualisation. See 'trim' arg in ?cimDiablo
#cutoff for correlations
network(final.diablo.model,
blocks = c(1,2,3),
color.node = c('darkorchid', 'brown1', 'lightgreen'),
cutoff = 0.50,
size.node = 0.05)network(final.diablo.model,
blocks = c(1,2,3),
color.node = c('darkorchid', 'brown1', 'lightgreen'),
cutoff = 0.90,
size.node = 0.10)network(final.diablo.model,
blocks = c(1,2,3),
color.node = c('darkorchid', 'brown1', 'lightgreen'),
cutoff = 0.75,
size.node = 0.10)par(mfrow=c(2,2))
for(i in 1:length(data))
auc.splsda = auroc(final.diablo.model, roc.block = names(data[i]),
roc.comp = 2, print = FALSE)no cut-off
net = network(final.diablo.model,
blocks = c(1,2,3),
color.node = c('darkorchid', 'brown1', 'lightgreen'),
cutoff = 0,
size.node = 0.10)## [1] "list"
## [1] "list"
## [1] "gR" "M_Enz_Met" "M_Enz_Pph" "M_Met_Pph" "cutoff"
## Length Class Mode
## gR 78 igraph list
## M_Enz_Met 546 -none- numeric
## M_Enz_Pph 350 -none- numeric
## M_Met_Pph 975 -none- numeric
## cutoff 1 -none- numeric
## [1] 0
## [1] -0.9405558 0.7732022
## [1] -0.8640388 0.7993718
## [1] -0.7776614 0.8834013
names(dimnames(net$M_Enz_Met)) = c('from','to')
a = as.data.frame(
as.table(net$M_Enz_Met),
responseName = 'CC'
)
a$fromType = 'Enz'
a$toType = 'Met'
a$from = gsub('_Enz$', '', a$from)
a$to = gsub('_Met$', '', a$to)
names(dimnames(net$M_Enz_Pph)) = c('from','to')
b = as.data.frame(
as.table(net$M_Enz_Pph),
responseName = 'CC'
)
b$fromType = 'Enz'
b$toType = 'Pph'
b$from = gsub('_Enz$', '', b$from)
b$to = gsub('_Pph$', '', b$to)
names(dimnames(net$M_Met_Pph)) = c('from','to')
e = as.data.frame(
as.table(net$M_Met_Pph),
responseName = 'CC'
)
e$fromType = 'Met'
e$toType = 'Pph'
e$from = gsub('_Met$', '', e$from)
e$to = gsub('_Pph$', '', e$to)
merged = rbind(a, b, e)
dim(merged)## [1] 1871 5
## IGRAPH 4d99719 UNW- 78 1871 --
## + attr: name (v/c), group (v/c), label.color (v/c), label.family (v/c),
## | label (v/c), color (v/c), shape (v/c), label.cex (v/n), size (v/n),
## | size2 (v/n), weight (e/n), label.color (e/c), color (e/c), lty (e/c),
## | width (e/n), label.cex (e/n)
## [1] 1871 2
write cor
## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8
## [2] LC_CTYPE=English_United Kingdom.utf8
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.utf8
##
## time zone: Europe/Ljubljana
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] BiocParallel_1.34.2 mixOmics_6.24.0 ggplot2_3.4.2
## [4] lattice_0.21-8 MASS_7.3-60
##
## loaded via a namespace (and not attached):
## [1] tidyr_1.3.0 sass_0.4.6 utf8_1.2.3 generics_0.1.3
## [5] stringi_1.7.12 digest_0.6.33 magrittr_2.0.3 evaluate_0.21
## [9] grid_4.3.1 RColorBrewer_1.1-3 fastmap_1.1.1 plyr_1.8.8
## [13] jsonlite_1.8.7 Matrix_1.6-0 ggrepel_0.9.3 RSpectra_0.16-1
## [17] gridExtra_2.3 purrr_1.0.1 fansi_1.0.4 scales_1.2.1
## [21] codetools_0.2-19 jquerylib_0.1.4 cli_3.6.1 rlang_1.1.1
## [25] munsell_0.5.0 withr_2.5.0 cachem_1.0.8 yaml_2.3.7
## [29] ellipse_0.5.0 tools_4.3.1 parallel_4.3.1 reshape2_1.4.4
## [33] dplyr_1.1.2 colorspace_2.1-0 corpcor_1.6.10 vctrs_0.6.3
## [37] R6_2.5.1 matrixStats_1.0.0 lifecycle_1.0.3 stringr_1.5.0
## [41] pkgconfig_2.0.3 pillar_1.9.0 bslib_0.5.0 gtable_0.3.3
## [45] glue_1.6.2 rARPACK_0.11-0 Rcpp_1.0.10 highr_0.10
## [49] xfun_0.39 tibble_3.2.1 tidyselect_1.2.0 rstudioapi_0.14
## [53] knitr_1.43 farver_2.1.1 snow_0.4-4 htmltools_0.5.5
## [57] igraph_1.5.0 labeling_0.4.2 rmarkdown_2.23 compiler_4.3.1